function graphs_intro(data,col_long,fp,paper_dir)

% Make some boxplots and tables for the introduction, linking child study 
% time, LW scores and parental characteristics, by child age category.

cd(paper_dir)

% define child age categories
beg_age = [3 6 9 13];
end_age = [5 8 12 16];

% Create hh income variable, averaged within each age category
tmp_hhinc_all = nan(size(data,1),2);

for i = 1:fp.N
    tmp_rows_i = 1 + (i-1) * (fp.M+1) : i * (fp.M+1);
    tmp_data_i = data(tmp_rows_i,:);
    tmp_new = nan(fp.M+1,1);
    for t = 1:4
        row_ind = (tmp_data_i(:,col_long.age) >= beg_age(t) & tmp_data_i(:,col_long.age) <= end_age(t));
        tmp_hhinc_it = tmp_data_i(row_ind,col_long.mom_wage).*tmp_data_i(row_ind,col_long.mom_hours) + ...
            tmp_data_i(row_ind,col_long.dad_wage).*tmp_data_i(row_ind,col_long.dad_hours) + ...
            tmp_data_i(row_ind,col_long.nonlabor_income);
        tmp_avg = nanmean(tmp_hhinc_it);
        tmp_new(row_ind,1) = tmp_avg; % repmat
    end
    tmp_hhinc_all(tmp_rows_i,1) = tmp_new;
end

tmp_hhinc_all(:,2) = data(:,col_long.mom_wage).*data(:,col_long.mom_hours) + ...
            data(:,col_long.dad_wage).*data(:,col_long.dad_hours) + ...
            data(:,col_long.nonlabor_income);  

% Fraction child study time / total time
tmp_tauc_all = data(:,col_long.tau_c);
tmp_tau1_all = data(:,col_long.tau1);
tmp_tau2_all = data(:,col_long.tau2);
frac_tauc_all = tmp_tauc_all ./ (tmp_tauc_all + tmp_tau1_all + tmp_tau2_all);

table = NaN(10,4);
se_table = NaN(10,4); % standard errors of correlations
% Note: SE of a correlation is sqrt( (1-r^2) / (n-2) )

for t = 1:4
    % select the rows we need for this age group
    row_ind = (data(:,col_long.age) >= beg_age(t) & data(:,col_long.age) <= end_age(t));
    
    tmp_tauc = data(row_ind,col_long.tau_c);
    tmp_lw = data(row_ind,col_long.lw_raw);
    tmp_momeduc = data(row_ind,col_long.mom_educ);
    tmp_dadeduc = data(row_ind,col_long.dad_educ);
    tmp_hhinc = tmp_hhinc_all(row_ind,1);
    tmp_frac_tauc = frac_tauc_all(row_ind,1);
    
    % mean study time by child age category
    table(1,t) = nanmean(tmp_tauc);
    % mean test score by child age category
    table(2,t) = nanmean(tmp_lw);
    
    % corr(tau_c, mom_educ) by child age category
    temp1 = tmp_tauc;
    temp2 = tmp_momeduc;
    temp = nancov(temp1,temp2,1);
    table(3,t) = temp(1,2)...
    ./(nanstd(temp1(isnan(temp1) == 0 & isnan(temp2) == 0),1) ...
    .* nanstd(temp2(isnan(temp1) == 0 & isnan(temp2) == 0),1));
    % SE of corr
    tmp_n = nansum(isnan(temp1) == 0 & isnan(temp2) == 0);
    se_table(3,t) = sqrt((1-table(3,t)^2)/(tmp_n-2));

    % corr(tau_c, dad_educ) by child age category
    temp1 = tmp_tauc;
    temp2 = tmp_dadeduc;
    temp = nancov(temp1,temp2,1);
    table(4,t) = temp(1,2)...
    ./(nanstd(temp1(isnan(temp1) == 0 & isnan(temp2) == 0),1) ...
    .* nanstd(temp2(isnan(temp1) == 0 & isnan(temp2) == 0),1));
    % SE of corr
    tmp_n = nansum(isnan(temp1) == 0 & isnan(temp2) == 0);
    se_table(4,t) = sqrt((1-table(4,t)^2)/(tmp_n-2));

    % corr(tau_c, Y) by child age category
    temp1 = tmp_tauc;
    temp2 = tmp_hhinc;
    temp = nancov(temp1,temp2,1);
    table(5,t) = temp(1,2)...
    ./(nanstd(temp1(isnan(temp1) == 0 & isnan(temp2) == 0),1) ...
    .* nanstd(temp2(isnan(temp1) == 0 & isnan(temp2) == 0),1));
    % SE of corr
    tmp_n = nansum(isnan(temp1) == 0 & isnan(temp2) == 0);
    se_table(5,t) = sqrt((1-table(5,t)^2)/(tmp_n-2));

    % corr(LW, mom_educ) by child age category
    temp1 = tmp_lw;
    temp2 = tmp_momeduc;
    temp = nancov(temp1,temp2,1);
    table(6,t) = temp(1,2)...
    ./(nanstd(temp1(isnan(temp1) == 0 & isnan(temp2) == 0),1) ...
    .* nanstd(temp2(isnan(temp1) == 0 & isnan(temp2) == 0),1));
    % SE of corr
    tmp_n = nansum(isnan(temp1) == 0 & isnan(temp2) == 0);
    se_table(6,t) = sqrt((1-table(6,t)^2)/(tmp_n-2));

    % corr(LW, dad_educ) by child age category
    temp1 = tmp_lw;
    temp2 = tmp_dadeduc;
    temp = nancov(temp1,temp2,1);
    table(7,t) = temp(1,2)...
    ./(nanstd(temp1(isnan(temp1) == 0 & isnan(temp2) == 0),1) ...
    .* nanstd(temp2(isnan(temp1) == 0 & isnan(temp2) == 0),1));
    % SE of corr
    tmp_n = nansum(isnan(temp1) == 0 & isnan(temp2) == 0);
    se_table(7,t) = sqrt((1-table(7,t)^2)/(tmp_n-2));

    % corr(LW, Y) by child age category
    temp1 = tmp_lw;
    temp2 = tmp_hhinc;
    temp = nancov(temp1,temp2,1);
    table(8,t) = temp(1,2)...
    ./(nanstd(temp1(isnan(temp1) == 0 & isnan(temp2) == 0),1) ...
    .* nanstd(temp2(isnan(temp1) == 0 & isnan(temp2) == 0),1));
    % SE of corr
    tmp_n = nansum(isnan(temp1) == 0 & isnan(temp2) == 0);
    se_table(8,t) = sqrt((1-table(8,t)^2)/(tmp_n-2));
    
    % fraction child study time / total time, average
    table(9,t) = nanmean(tmp_frac_tauc);
    
    % corr(LW, tauc) by child age category
    temp1 = tmp_lw;
    temp2 = tmp_tauc;
    temp = nancov(temp1,temp2,1);
    table(10,t) = temp(1,2)...
    ./(nanstd(temp1(isnan(temp1) == 0 & isnan(temp2) == 0),1) ...
    .* nanstd(temp2(isnan(temp1) == 0 & isnan(temp2) == 0),1));
    % SE of corr
    tmp_n = nansum(isnan(temp1) == 0 & isnan(temp2) == 0);
    se_table(10,t) = sqrt((1-table(10,t)^2)/(tmp_n-2));
end

prec1 = '%.3f'; % precision for means

diary off;
diary_dir = [cd,'\table_intro.txt'];
force_delete(diary_dir);
diary('table_intro.txt');
diary on;

disp(['\begin{table}[!htb]']);
disp(['\begin{center}']);
disp(['\caption{Data Correlations}']);
disp(['\label{table:intro}']);
disp(['\begin{tabular*}{\textwidth}{@{\extracolsep{\fill}} lcc}']);
disp(['\hline \hline']);
disp(['\multicolumn{1}{l}{\textbf{Child Ages}} \rule{0pt}{3ex}& \multicolumn{1}{c}{\textbf{9-12}} & \multicolumn{1}{c}{\textbf{13-16}}  \\']);
disp('\hline')

disp(['Letter Word Score, Child time \rule{0pt}{3ex}& ', num2str(table(10,3),prec1), ' & ', num2str(table(10,4),prec1),' \\',...
    ' & (', num2str(se_table(10,3),prec1), ') & (', num2str(se_table(10,4),prec1),') \\',...
    'Letter Word Score, Mother''s Educ. \rule{0pt}{3ex}& ', num2str(table(6,3),prec1), ' & ', num2str(table(6,4),prec1),' \\',...
    ' & (', num2str(se_table(6,3),prec1), ') & (', num2str(se_table(6,4),prec1),') \\',...
    'Letter Word Score, Father''s Educ. \rule{0pt}{3ex}& ', num2str(table(7,3),prec1), ' & ', num2str(table(7,4),prec1),' \\',...
    ' & (', num2str(se_table(7,3),prec1), ') & (', num2str(se_table(7,4),prec1),') \\',...
    'Letter Word Score, HH Income \rule{0pt}{3ex}& ', num2str(table(8,3),prec1), ' & ', num2str(table(8,4),prec1),' \\', ...
    ' & (', num2str(se_table(8,3),prec1), ') & (', num2str(se_table(8,4),prec1),') \\',...
    'Child time, Mother''s Educ. \rule{0pt}{3ex}& ', num2str(table(3,3),prec1), ' & ', num2str(table(3,4),prec1),' \\',...
    ' & (', num2str(se_table(3,3),prec1), ') & (', num2str(se_table(3,4),prec1),') \\',...
    'Child time, Father''s Educ. \rule{0pt}{3ex}& ', num2str(table(4,3),prec1), ' & ', num2str(table(4,4),prec1),' \\',...
    ' & (', num2str(se_table(4,3),prec1), ') & (', num2str(se_table(4,4),prec1),') \\',...
    'Child time, HH Income \rule{0pt}{3ex}& ', num2str(table(5,3),prec1), ' & ', num2str(table(5,4),prec1),' \\',...
    ' & (', num2str(se_table(5,3),prec1), ') & (', num2str(se_table(5,4),prec1),') \\']);
    
disp('\hline \hline');
disp('\end{tabular*}');
disp('\end{center}');
disp(['\textbf{Source}: PSID-CDS combined sample from 1997, 2002 and 2007 interviews and PSID core data between 1986 and 2010. ',...
    'To alleviate the missing data problem at young child ages, ``HH income" is defined as the average total household income within each relevant child age bin. ',...
    'Standard Errors of the correlations are between brackets, and are defined as $SE_r = \sqrt{\frac{1-r^2}{n-2}}$.']);
disp('\end{table}');
diary off;

%% Box plots of child study time, by age category

age_bin_index = nan(size(data,1),1); % 1,2,3,4
tmp_tauc_all = data(:,col_long.tau_c);

for t = 1:4
    % select the rows we need for this age group
    row_ind = (data(:,col_long.age) >= beg_age(t) & data(:,col_long.age) <= end_age(t));
    age_bin_index(row_ind,1) = t;
end
f = figure(1046574351);
set(f,'visible','off');
boxplot(tmp_tauc_all,age_bin_index, 'Labels',{'3-5','6-8','9-12','13-16'})
xlabel('Child age category')
ylabel('Hours per week')
saveas(f,'figure_intro', 'png')
saveas(f,'figure_intro', 'eps')

%% Bar charts of child study time, by age category

% 5 time categories:
edges = [-1 1 4 7 10 100];

% divide tauc by age groups
cat1 = tmp_tauc_all(age_bin_index == 1);
cat2 = tmp_tauc_all(age_bin_index == 2);
cat3 = tmp_tauc_all(age_bin_index == 3);
cat4 = tmp_tauc_all(age_bin_index == 4);
[N1,edges] = histcounts(cat1,edges,'Normalization', 'probability');
[N2,edges] = histcounts(cat2,edges,'Normalization', 'probability');
[N3,edges] = histcounts(cat3,edges,'Normalization', 'probability');
[N4,edges] = histcounts(cat4,edges,'Normalization', 'probability');

% Create overlapping bar chart

f = figure(3123359);
set(f,'visible','off');
b=bar([1 2 3 4 5], [N1' N2' N3' N4']);
legend({'Ages $3-5$','Ages $6-8$','Ages $9-12$','Ages $13-16$'},'Location','northeast','interpreter','latex')
xlabel('Hours per week')
ylabel('Relative frequency')
b(1).FaceColor = '0.2 0.2 0.2';
b(2).FaceColor = '0.4 0.4 0.4';
b(3).FaceColor = '0.7 0.7 0.7';
b(4).FaceColor = '0.9 0.9 0.9';
ax = gca;
ax.XTick = [1 2 3 4 5]; 
set(gca,'TickLabelInterpreter','latex')
ax.XTickLabels = {'$0-1$','$1-4$','$4-7$','$7-10$','$10+$'};
saveas(f,'figure_bar5', 'png')
saveas(f,'figure_bar5', 'eps')

%% Fraction of child study time
f = figure(1046574352);
set(f,'visible','off');
boxplot(frac_tauc_all,age_bin_index, 'Labels',{'3-5','6-8','9-12','13-16'})
xlabel('Child age category')
ylabel('Fraction')
saveas(f,'figure_intro2', 'png')
saveas(f,'figure_intro2', 'eps')

%% Regression using age FEs and average income and education

% 1) Define right-hand side variables
cons = ones(size(data,1),1);       % constant
educ1 = data(:,col_long.mom_educ); % mother's education
educ2 = data(:,col_long.dad_educ); % father's education

% create full set of child age dummies
% d3 = (data(:,col_long.age)==3); % dropped
d4 = (data(:,col_long.age)==4);
d5 = (data(:,col_long.age)==5);
d6 = (data(:,col_long.age)==6);
d7 = (data(:,col_long.age)==7);
d8 = (data(:,col_long.age)==8);
d9 = (data(:,col_long.age)==9);
d10 = (data(:,col_long.age)==10);
d11 = (data(:,col_long.age)==11);
d12 = (data(:,col_long.age)==12);
d13 = (data(:,col_long.age)==13);
d14 = (data(:,col_long.age)==14);
d15 = (data(:,col_long.age)==15);
d16 = (data(:,col_long.age)==16);

% Define the average (observed) income over all observed periods, separately for each household
income_avgs = NaN(size(data,1),1);
for q = 1:fp.N
    rows_i = 1+(fp.M+1)*(q-1):(fp.M+1)*q; % all 17 rows for this household
    tmp_data_i = data(rows_i, : ); % all data for this household
    Y1 = tmp_data_i(:,col_long.mom_wage).*tmp_data_i(:,col_long.mom_hours); % mother labor income
    Y2 = tmp_data_i(:,col_long.dad_wage).*tmp_data_i(:,col_long.dad_hours); % father labor income
    Y3 = tmp_data_i(:,col_long.nonlabor_income); % nonlabor income
    % replace nans by zeros for Y1 and Y2, otherwise we drop all unemployed couples
    Y1(isnan(Y1)) = 0;
    Y2(isnan(Y2)) = 0;
    tmp = Y1+Y2+Y3; % will again have nans whenever NLI = missing
    income_avgs(rows_i,1) = nanmean(tmp); % copy the avg income 17 times
end

% Compile all right-hand side variables
Xmat = [income_avgs/1000 educ1 educ2 d4 d5 d6 d7 d8 d9 d10 d11 d12 d13 d14 d15 d16 cons];

% 2) Define left-hand side variables
tauc = data(:,col_long.tau_c);
tau1 = data(:,col_long.tau1);
tau2 = data(:,col_long.tau2);
taup = tau1+tau2; 
tautot = tauc+taup;
LW = data(:,col_long.lw_raw);

% Now run all 6 regressions
[b1c,bint1c,~,~,stats] = regress(tauc,Xmat);
[b2c,bint2c,~,~,stats] = regress(tau1,Xmat);
[b3c,bint3c,~,~,stats] = regress(tau2,Xmat);
[b4c,bint4c,~,~,stats] = regress(taup,Xmat);
[b5c,bint5c,~,~,stats] = regress(tautot,Xmat);
[b6c,bint6c,~,~,stats] = regress(LW,Xmat);

mat_c = [b1c b2c b3c b4c b5c b6c];

lb_c = [bint1c(:,1) bint2c(:,1) bint3c(:,1) bint4c(:,1) bint5c(:,1) bint6c(:,1) ];
ub_c = [bint1c(:,2) bint2c(:,2) bint3c(:,2) bint4c(:,2) bint5c(:,2) bint6c(:,2) ];

% Plot the regression coefficients on father's years of education
tmp_coeff = 3; % row of father educ coefficient
tmp_selec = [1 2 3 4 5 6]; % select the models we want to plot;
x = 1:1:length(tmp_selec); 

ystart = [lb_c(tmp_coeff, tmp_selec)];
ymid = mat_c(tmp_coeff, tmp_selec); % slope coefficients
yend = [ub_c(tmp_coeff, tmp_selec)];

f = figure; 
set(f,'visible','off');
hold on;
xlim([0.5 6.5]);
for idx = 1 : numel(ystart)
    plot([x(idx) x(idx)], [ystart(idx) yend(idx)], 'k');
    plot(x(idx), ymid(idx),'kx');
end
plot([0.5 x(end)+0.5], [0 0],'k','LineWidth',2);
set(gca,'TickLabelInterpreter', 'latex');
set(gca,'XTick',[1:7])
set(gca,'XTickLabel',{'$\tau_c$','$\tau_1$','$\tau_2$','$\tau_p$','$\tau_{tot}$','LW'})
xlabel('Dependent variables');
ylabel('Estimated slope coefficient');
saveas(f,'figure_slopes_educ2_all', 'png')
saveas(f,'figure_slopes_educ2_all', 'eps')